ggplotly() and plot_ly()plot_geo()We will work with two Starbucks datasets, one on the store locations (global) and one for the nutritional data for their food and drink items. We will do some text analysis of the menu items.
sb_locs <- read_csv("starbucks-locations.csv", show_col_types = FALSE)
sb_nutr <- read_csv("starbucks-menu-nutrition.csv", show_col_types = FALSE)
usa_pop <- read_csv("us_state_pop.csv", show_col_types = FALSE)
usa_states<-read_csv("states.csv", show_col_types = FALSE)
skim(sb_locs)
| Name | sb_locs |
| Number of rows | 25600 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 11 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Brand | 0 | 1.00 | 7 | 21 | 0 | 4 | 0 |
| Store Number | 0 | 1.00 | 5 | 12 | 0 | 25599 | 0 |
| Store Name | 0 | 1.00 | 2 | 46 | 0 | 25364 | 0 |
| Ownership Type | 0 | 1.00 | 8 | 13 | 0 | 4 | 0 |
| Street Address | 2 | 1.00 | 1 | 165 | 0 | 25353 | 0 |
| City | 15 | 1.00 | 2 | 29 | 0 | 5469 | 0 |
| State/Province | 0 | 1.00 | 1 | 3 | 0 | 338 | 0 |
| Country | 0 | 1.00 | 2 | 2 | 0 | 73 | 0 |
| Postcode | 1522 | 0.94 | 1 | 9 | 0 | 18887 | 0 |
| Phone Number | 6861 | 0.73 | 1 | 15 | 0 | 18559 | 0 |
| Timezone | 0 | 1.00 | 18 | 30 | 0 | 101 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Longitude | 1 | 1 | -27.87 | 96.84 | -159.46 | -104.66 | -79.35 | 100.63 | 176.92 | ▇▇▂▂▅ |
| Latitude | 1 | 1 | 34.79 | 13.34 | -46.41 | 31.24 | 36.75 | 41.57 | 64.85 | ▁▁▁▇▂ |
skim(sb_nutr)
| Name | sb_nutr |
| Number of rows | 205 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Item | 0 | 1 | 10 | 65 | 0 | 187 | 0 |
| Category | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Calories | 0 | 1 | 257.24 | 158.36 | 0 | 130 | 250 | 380 | 650 | ▇▇▇▆▁ |
| Fat (g) | 0 | 1 | 10.06 | 9.66 | 0 | 0 | 7 | 18 | 37 | ▇▂▃▂▁ |
| Carb. (g) | 0 | 1 | 33.97 | 17.61 | 0 | 21 | 35 | 45 | 80 | ▃▆▇▃▁ |
| Fiber (g) | 0 | 1 | 1.77 | 2.63 | 0 | 0 | 1 | 3 | 21 | ▇▁▁▁▁ |
| Protein (g) | 0 | 1 | 8.19 | 8.19 | 0 | 1 | 6 | 13 | 34 | ▇▃▂▂▁ |
skim(usa_pop)
| Name | usa_pop |
| Number of rows | 55 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| state | 0 | 1 | 4 | 24 | 0 | 55 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| population | 0 | 1 | 5677621 | 6714730 | 56882 | 1344331 | 3751351 | 6515716 | 37253956 | ▇▂▁▁▁ |
skim(usa_states)
| Name | usa_states |
| Number of rows | 51 |
| Number of columns | 2 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| State | 0 | 1 | 4 | 20 | 0 | 51 | 0 |
| Abbreviation | 0 | 1 | 2 | 2 | 0 | 51 | 0 |
sb_usa <- sb_locs |> filter(
Country == "US"
)
sb_locs_state <- sb_usa |>
group_by(
`State/Province`
) |>
rename(
state = 'State/Province'
) |>
summarize(
n_stores = n()
)
# need state abbreviations
usa_pop_abbr <-
full_join(usa_pop, usa_states,
by = join_by(state == State))
sb_locs_state <- full_join(
sb_locs_state,
usa_pop_abbr,
by = join_by(state==Abbreviation)
)
summary(sb_locs_state)
## state n_stores state.y population
## Length:55 Min. : 8.0 Length:55 Min. : 56882
## Class :character 1st Qu.: 56.5 Class :character 1st Qu.: 1344331
## Mode :character Median : 123.0 Mode :character Median : 3751351
## Mean : 266.8 Mean : 5677621
## 3rd Qu.: 332.0 3rd Qu.: 6515716
## Max. :2821.0 Max. :37253956
## NA's :4
ggplotly for EDAAnswer the following questions:
Are the number of Starbucks proportional to the population of a state? (scatterplot)
Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)
What are the top 20 words in Starbucks menu items? (bar plot)
p1 <- ggplot(sb_locs_state, aes(x=population, y=n_stores, color=state)) +
geom_point(alpha= 0.8)
ggplotly(p1)
# Create the ggplot object with separate geom_histogram for food and drink
p2 <- ggplot() +
geom_histogram(data = subset(sb_nutr, Category == "Food"),
aes(x = Calories), bins = 25, fill = "blue", color = "black", alpha = 0.5) +
geom_histogram(data = subset(sb_nutr, Category == "Drinks"),
aes(x = Calories), bins = 25, fill = "red", color = "black", alpha = 0.5) +
labs(title = "Caloric Distribution of Starbucks Menu Items",
x = "Calories", y = "Frequency")
ggplotly(p2)
p3<- sb_nutr |>
unnest_tokens(word, Item) %>%
count(word, sort = TRUE) %>%
head(20) %>%
ggplot(aes(x = fct_reorder(word, n), y = n)) +
geom_col(fill = "blue") +
coord_flip() +
labs(title = "Top 20 Words in Starbucks Menu Items",
x = "Word",
y = "Frequency")
ggplotly(p3)
plot_ly()plot_ly() representing the
relationship between calories and carbssb_nutr %>%
plot_ly(x = ~Calories,
y = ~`Carb. (g)`,
type = 'scatter',
mode = 'markers',
color = ~Category)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
hovermode = "compare"topwords <- sb_nutr %>%
unnest_tokens(word, Item, token = "words") %>%
count(word, sort=T) %>%
head(10)
sb_nutr %>%
unnest_tokens(word, Item, token = "words") %>%
filter(word %in% topwords$word) %>%
plot_ly(x = ~Calories,
y = ~`Carb. (g)`,
type = 'scatter',
mode = 'markers',
color = ~Category,
text = ~paste0("Item: ", word, "<br>Calories: ", Calories, "<br>Carb. (g): ", `Carb. (g)`),
hoverinfo = 'text'
) %>%
layout(title = "Calories vs Carbohydrates",
xaxis = list(title = "Calories"),
yaxis = list(title = "Carb. (g)"),
hovermode = "compare")
plot_ly Boxplotsfiltered_data <- sb_nutr %>%
unnest_tokens(word, Item, token = "words") %>%
filter(word %in% topwords$word)
boxplot <- filtered_data %>%
plot_ly(x = ~word, type = "box", y = ~Calories, name = "Calories", alpha = 0.5) %>%
add_boxplot(y = ~`Fat (g)`, name = "Fat", alpha = 0.5) %>%
add_boxplot(y = ~`Carb. (g)`, name = "Carbohydrates", alpha = 0.5) %>%
add_boxplot(y = ~`Fiber (g)`, name = "Fiber", alpha = 0.5) %>%
add_boxplot(y = ~`Protein (g)`, name = "Protein", alpha = 0.5) %>%
layout(title = "Boxplot of Nutritional Contents by Top Words in Item Names",
xaxis = list(title = "Top Words in Item Names"),
yaxis = list(title = "Value"),
hovermode = "compare")
# Create subplots for each boxplot
boxplot
filtered_data %>%
plot_ly(x=~Calories,
y=~`Carb. (g)`,
z=~`Protein (g)`,
type="scatter3d",
mode="markers",
color=~word)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly Map# Set up mapping details
set_map_details <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('steelblue')
)
# Make sure both maps are on the same color scale
shadeLimit <- 125
# Create hover text
sb_locs_state$hover <- with(sb_locs_state, paste("Number of Starbucks: ", n_stores, '<br>', "State: ", state.y, '<br>', "Population: ", population))
# Create the map
map1 <- plot_geo(sb_locs_state, locationmode="USA-states") %>%
add_trace(z=~n_stores, text=~hover,locations=~state,color=~n_stores, colors="Purples") %>%
layout(title="Starbucks stores by state", geo=set_map_details)
map2 <- plot_geo(sb_locs_state, locationmode="USA-states") %>%
add_trace(z = ~population, locations = ~state, text = ~hover, color = ~population, colors = 'Reds') %>%
colorbar(title = "Population") %>%
layout(title = "Population by State", geo = set_map_details)
subplot(map1, map2)
## Warning: Ignoring 4 observations
It seems that for states with a higher population, the starbucks store count is also higher, such as California, New York, Texas and Florida.